from pyspark import SparkConf, SparkContext
import os

# Point Spark workers at the local Python interpreter to use.
os.environ['PYSPARK_PYTHON'] = 'D:/PYTHON/python3.10/python.exe'

# Run Spark locally using all available cores.
conf = SparkConf().setMaster('local[*]').setAppName('my_test_spark')
sc = SparkContext(conf=conf)

try:
    # Load the input text file as an RDD of lines.
    rdd = sc.textFile('E:/PythonBasicKnowledge/12 pyspark使用/统计单词数量.txt')

    # Split each line on single spaces and flatten into one RDD of words.
    words = rdd.flatMap(lambda line: line.split(' '))

    # Deduplicate the words.
    unique_words = words.distinct()

    print(unique_words.collect())
finally:
    # Always release the SparkContext, even if reading or collecting fails;
    # the original script leaked the context on any error before sc.stop().
    sc.stop()